from IPython.display import Image
In this notebook we study and preprocess the numerical and categorical variables. The following steps are carried out:
1. Casting variable types
2. Splitting into train and test
3. Analysing each variable with descriptive plots
4. For numerical variables: Pearson correlations, outlier analysis and missing-value analysis
5. For categorical variables: filling missing values and studying correlations with Cramér's V
For the per-column outlier analysis, the following chart is taken into account:
Image(filename="Screenshots/analisis_outlier.png")
Within the treatment of missing values, one of the following methods will be chosen:
Image(filename="Screenshots/missing.png")
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import plotly.express as px
from sklearn.impute import KNNImputer
import scipy.stats as ss
import warnings
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 5000)
Good practices
I keep the functions that are already automated and that I expect to reuse in other projects in a funciones_auxiliares.py file, and import it:
#import funciones_auxiliares as f_aux
# If you import the functions from the .py this way, a function call becomes: f_aux.plot_feature(...)
# and the next cell of function definitions should be removed, since they would already be imported from funciones_auxiliares.py
def plot_feature(df, col_name, isContinuous, target):
"""
Visualize a variable with and without faceting on the loan status.
- df dataframe
- col_name is the variable name in the dataframe
- full_name is the full variable name
- continuous is True if the variable is continuous, False otherwise
"""
f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12,3), dpi=90)
count_null = df[col_name].isnull().sum()
if isContinuous:
sns.histplot(df.loc[df[col_name].notnull(), col_name], kde=False, ax=ax1)
else:
        sns.countplot(x=df[col_name], order=sorted(df[col_name].dropna().unique()), color='#5975A4', saturation=1, ax=ax1)
ax1.set_xlabel(col_name)
ax1.set_ylabel('Count')
    ax1.set_title(col_name + ' - Number of nulls: ' + str(count_null))
plt.xticks(rotation = 90)
if isContinuous:
sns.boxplot(x=col_name, y=target, data=df, ax=ax2)
ax2.set_ylabel('')
ax2.set_title(col_name + ' by '+target)
else:
data = df.groupby(col_name)[target].value_counts(normalize=True).to_frame('proportion').reset_index()
        data.columns = [col_name, target, 'proportion']
sns.barplot(x = col_name, y = 'proportion', hue= target, data = data, saturation=1, ax=ax2)
ax2.set_ylabel(target+' fraction')
ax2.set_title(target)
plt.xticks(rotation = 90)
ax2.set_xlabel(col_name)
plt.tight_layout()
def dame_variables_categoricas(dataset=None):
    '''
    ----------------------------------------------------------------------------------------------------------
    Function dame_variables_categoricas:
    ----------------------------------------------------------------------------------------------------------
    - Description: receives a dataset and returns the names of the categorical variables
    - Inputs:
        -- dataset: pandas dataframe containing the data
    - Return:
        -- lista_variables_categoricas: names of the categorical variables of the input
           dataset with fewer than 100 distinct values
        -- other: names of the remaining non-numeric variables (100 or more distinct values)
        -- 1: the execution failed
    '''
if dataset is None:
        print(u'\nMissing arguments in the function call')
return 1
lista_variables_categoricas = []
other = []
for i in dataset.columns:
if (dataset[i].dtype!=float) & (dataset[i].dtype!=int):
            unicos = int(len(np.unique(dataset[i].dropna())))
if unicos < 100:
lista_variables_categoricas.append(i)
else:
other.append(i)
return lista_variables_categoricas, other
def get_corr_matrix(dataset = None, metodo='pearson', size_figure=[10,8]):
    # To obtain the Spearman correlation, just change metodo to 'spearman'
if dataset is None:
        print(u'\nMissing arguments in the function call')
return 1
sns.set(style="white")
# Compute the correlation matrix
corr = dataset.corr(method=metodo)
# Set self-correlation to zero to avoid distraction
for i in range(corr.shape[0]):
corr.iloc[i, i] = 0
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=size_figure)
# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, center=0,
square=True, linewidths=.5, cmap ='viridis' ) #cbar_kws={"shrink": .5}
plt.show()
return 0
def get_deviation_of_mean_perc(pd_loan, list_var_continuous, target, multiplier):
"""
Devuelve el porcentaje de valores que exceden del intervalo de confianza
:type series:
:param multiplier:
:return:
"""
pd_final = pd.DataFrame()
for i in list_var_continuous:
series_mean = pd_loan[i].mean()
series_std = pd_loan[i].std()
std_amp = multiplier * series_std
left = series_mean - std_amp
right = series_mean + std_amp
size_s = pd_loan[i].size
perc_goods = pd_loan[i][(pd_loan[i] >= left) & (pd_loan[i] <= right)].size/size_s
perc_excess = pd_loan[i][(pd_loan[i] < left) | (pd_loan[i] > right)].size/size_s
if perc_excess>0:
pd_concat_percent = pd.DataFrame(pd_loan[target][(pd_loan[i] < left) | (pd_loan[i] > right)]\
.value_counts(normalize=True).reset_index()).T
pd_concat_percent.columns = [pd_concat_percent.iloc[0,0],
pd_concat_percent.iloc[0,1]]
pd_concat_percent = pd_concat_percent.drop('index',axis=0)
pd_concat_percent['variable'] = i
pd_concat_percent['sum_outlier_values'] = pd_loan[i][(pd_loan[i] < left) | (pd_loan[i] > right)].size
            pd_concat_percent['porcentaje_sum_outlier_values'] = perc_excess
pd_final = pd.concat([pd_final, pd_concat_percent], axis=0).reset_index(drop=True)
if pd_final.empty:
        print('No variables with outlier values found')
return pd_final
def get_percent_null_values_target(pd_loan, list_var_continuous, target):
pd_final = pd.DataFrame()
for i in list_var_continuous:
if pd_loan[i].isnull().sum()>0:
pd_concat_percent = pd.DataFrame(pd_loan[target][pd_loan[i].isnull()]\
.value_counts(normalize=True).reset_index()).T
pd_concat_percent.columns = [pd_concat_percent.iloc[0,0],
pd_concat_percent.iloc[0,1]]
pd_concat_percent = pd_concat_percent.drop('index',axis=0)
pd_concat_percent['variable'] = i
pd_concat_percent['sum_null_values'] = pd_loan[i].isnull().sum()
pd_concat_percent['porcentaje_sum_null_values'] = pd_loan[i].isnull().sum()/pd_loan.shape[0]
pd_final = pd.concat([pd_final, pd_concat_percent], axis=0).reset_index(drop=True)
if pd_final.empty:
        print('No variables with null values found')
return pd_final
def cramers_v(confusion_matrix):
"""
calculate Cramers V statistic for categorial-categorial association.
uses correction from Bergsma and Wicher,
Journal of the Korean Statistical Society 42 (2013): 323-328
confusion_matrix: tabla creada con pd.crosstab()
"""
chi2 = ss.chi2_contingency(confusion_matrix)[0]
n = confusion_matrix.sum()
phi2 = chi2 / n
r, k = confusion_matrix.shape
phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
rcorr = r - ((r-1)**2)/(n-1)
kcorr = k - ((k-1)**2)/(n-1)
return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))
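For reference, the bias-corrected statistic implemented above is

$$\varphi^2 = \frac{\chi^2}{n}, \qquad \tilde{\varphi}^2 = \max\!\left(0,\; \varphi^2 - \frac{(k-1)(r-1)}{n-1}\right),$$

$$\tilde{r} = r - \frac{(r-1)^2}{n-1}, \qquad \tilde{k} = k - \frac{(k-1)^2}{n-1}, \qquad V = \sqrt{\frac{\tilde{\varphi}^2}{\min(\tilde{k}-1,\ \tilde{r}-1)}},$$

where $r$ and $k$ are the numbers of rows and columns of the contingency table and $n$ is the total number of observations.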
Reading the data and casting variable types
pd_loan = pd.read_csv("./data_preprocessing/pd_data_initial_preprocessing.csv")\
.set_index(["id", "url", "zip_code"])\
.drop('emp_title',axis=1)
pd_loan.shape
(442567, 33)
pd_loan.columns
Index(['mths_since_last_record', 'mths_since_recent_bc_dlq',
'mths_since_last_major_derog', 'mths_since_recent_revol_delinq',
'mths_since_last_delinq', 'emp_length', 'mo_sin_old_il_acct',
'revol_util', 'dti', 'loan_status', 'term', 'int_rate', 'installment',
'grade', 'sub_grade', 'mort_acc', 'pub_rec_bankruptcies',
'home_ownership', 'annual_inc', 'verification_status', 'total_acc',
'initial_list_status', 'purpose', 'addr_state', 'fico_range_low',
'fico_range_high', 'mo_sin_old_rev_tl_op', 'application_type',
'open_acc', 'pub_rec', 'loan_amnt', 'earliest_cr_line_month',
'earliest_cr_line_year'],
dtype='object')
list_var_cat, other = dame_variables_categoricas(dataset=pd_loan)
pd_loan[list_var_cat] = pd_loan[list_var_cat].astype("category")
list_var_continuous = list(pd_loan.select_dtypes('float').columns)
pd_loan[list_var_continuous] = pd_loan[list_var_continuous].astype(float)
pd_loan.dtypes
mths_since_last_record            float64
mths_since_recent_bc_dlq          float64
mths_since_last_major_derog       float64
mths_since_recent_revol_delinq    float64
mths_since_last_delinq            float64
emp_length                        float64
mo_sin_old_il_acct                float64
revol_util                        float64
dti                               float64
loan_status                      category
term                              float64
int_rate                          float64
installment                       float64
grade                            category
sub_grade                         float64
mort_acc                          float64
pub_rec_bankruptcies              float64
home_ownership                   category
annual_inc                        float64
verification_status              category
total_acc                         float64
initial_list_status              category
purpose                          category
addr_state                       category
fico_range_low                    float64
fico_range_high                   float64
mo_sin_old_rev_tl_op              float64
application_type                 category
open_acc                          float64
pub_rec                           float64
loan_amnt                         float64
earliest_cr_line_month            float64
earliest_cr_line_year             float64
dtype: object
pd_plot_loan_status = pd_loan['loan_status']\
.value_counts(normalize=True)\
.mul(100).rename('percent').reset_index()
pd_plot_loan_status_conteo = pd_loan['loan_status'].value_counts().reset_index()
pd_plot_loan_status_pc = pd.merge(pd_plot_loan_status,
pd_plot_loan_status_conteo, on=['index'], how='inner')
fig = px.histogram(pd_plot_loan_status_pc, x="index", y=['percent'])
fig.show()
from sklearn.model_selection import train_test_split
X_pd_loan, X_pd_loan_test, y_pd_loan, y_pd_loan_test = train_test_split(pd_loan.drop('loan_status',axis=1),
pd_loan['loan_status'],
stratify=pd_loan['loan_status'],
test_size=0.2)
pd_loan_train = pd.concat([X_pd_loan, y_pd_loan],axis=1)
pd_loan_test = pd.concat([X_pd_loan_test, y_pd_loan_test],axis=1)
print('== Train\n', pd_loan_train['loan_status'].value_counts(normalize=True))
print('== Test\n', pd_loan_test['loan_status'].value_counts(normalize=True))
== Train
 Fully Paid     0.780719
Charged Off    0.219281
Name: loan_status, dtype: float64
== Test
 Fully Paid     0.780713
Charged Off    0.219287
Name: loan_status, dtype: float64
Checking the number of null values by row and by column
pd_series_null_columns = pd_loan_train.isnull().sum().sort_values(ascending=False)
pd_series_null_rows = pd_loan_train.isnull().sum(axis=1).sort_values(ascending=False)
print(pd_series_null_columns.shape, pd_series_null_rows.shape)
pd_null_columnas = pd.DataFrame(pd_series_null_columns, columns=['nulos_columnas'])
pd_null_filas = pd.DataFrame(pd_series_null_rows, columns=['nulos_filas'])
pd_null_filas['target'] = pd_loan_train['loan_status'].copy()
pd_null_columnas['porcentaje_columnas'] = pd_null_columnas['nulos_columnas']/pd_loan_train.shape[0]
pd_null_filas['porcentaje_filas']= pd_null_filas['nulos_filas']/pd_loan_train.shape[1]
(33,) (354053,)
pd_null_columnas
| | nulos_columnas | porcentaje_columnas |
|---|---|---|
| mths_since_last_record | 287632 | 0.812398 |
| mths_since_recent_bc_dlq | 261211 | 0.737774 |
| mths_since_last_major_derog | 250094 | 0.706374 |
| mths_since_recent_revol_delinq | 226529 | 0.639817 |
| mths_since_last_delinq | 172430 | 0.487017 |
| emp_length | 19198 | 0.054224 |
| mo_sin_old_il_acct | 10146 | 0.028657 |
| revol_util | 170 | 0.000480 |
| dti | 13 | 0.000037 |
| application_type | 0 | 0.000000 |
| fico_range_low | 0 | 0.000000 |
| fico_range_high | 0 | 0.000000 |
| mo_sin_old_rev_tl_op | 0 | 0.000000 |
| earliest_cr_line_month | 0 | 0.000000 |
| open_acc | 0 | 0.000000 |
| pub_rec | 0 | 0.000000 |
| loan_amnt | 0 | 0.000000 |
| purpose | 0 | 0.000000 |
| earliest_cr_line_year | 0 | 0.000000 |
| addr_state | 0 | 0.000000 |
| home_ownership | 0 | 0.000000 |
| initial_list_status | 0 | 0.000000 |
| total_acc | 0 | 0.000000 |
| verification_status | 0 | 0.000000 |
| annual_inc | 0 | 0.000000 |
| pub_rec_bankruptcies | 0 | 0.000000 |
| mort_acc | 0 | 0.000000 |
| sub_grade | 0 | 0.000000 |
| grade | 0 | 0.000000 |
| installment | 0 | 0.000000 |
| int_rate | 0 | 0.000000 |
| term | 0 | 0.000000 |
| loan_status | 0 | 0.000000 |
pd_null_filas.head()
| id | url | zip_code | nulos_filas | target | porcentaje_filas |
|---|---|---|---|---|---|
| 36098610 | https://lendingclub.com/browse/loanDetail.action?loan_id=36098610 | 481xx | 7 | Charged Off | 0.212121 |
| 26170038 | https://lendingclub.com/browse/loanDetail.action?loan_id=26170038 | 291xx | 7 | Fully Paid | 0.212121 |
| 43975682 | https://lendingclub.com/browse/loanDetail.action?loan_id=43975682 | 074xx | 7 | Charged Off | 0.212121 |
| 44995881 | https://lendingclub.com/browse/loanDetail.action?loan_id=44995881 | 484xx | 7 | Fully Paid | 0.212121 |
| 15370337 | https://lendingclub.com/browse/loanDetail.action?loan_id=15370337 | 116xx | 7 | Fully Paid | 0.212121 |
Distribution of the remaining variables
# warnings.filterwarnings('ignore')
for i in list(pd_loan_train.columns):
if (pd_loan_train[i].dtype==float) & (i!='loan_status'):
plot_feature(pd_loan_train, col_name=i, isContinuous=True, target='loan_status')
elif i!='loan_status':
plot_feature(pd_loan_train, col_name=i, isContinuous=False, target='loan_status')
Next, missing values, correlations between the continuous variables, and outliers are treated.
list_var_continuous
['mths_since_last_record', 'mths_since_recent_bc_dlq', 'mths_since_last_major_derog', 'mths_since_recent_revol_delinq', 'mths_since_last_delinq', 'emp_length', 'mo_sin_old_il_acct', 'revol_util', 'dti', 'term', 'int_rate', 'installment', 'sub_grade', 'mort_acc', 'pub_rec_bankruptcies', 'annual_inc', 'total_acc', 'fico_range_low', 'fico_range_high', 'mo_sin_old_rev_tl_op', 'open_acc', 'pub_rec', 'loan_amnt', 'earliest_cr_line_month', 'earliest_cr_line_year']
Outlier values can be replaced with the mean, the median, or extreme values (mean + 3·std or mean - 3·std). After the following analysis, I have decided to leave them untouched as a first iteration; a capping sketch is given after the analysis below. Once a model is in place, I can iterate with different methods to check whether it improves the results.
get_deviation_of_mean_perc(pd_loan_train, list_var_continuous, target='loan_status', multiplier=3)
| | Fully Paid | Charged Off | variable | sum_outlier_values | porcentaje_sum_outlier_values |
|---|---|---|---|---|---|
| 0 | 0.851852 | 0.148148 | mths_since_recent_bc_dlq | 54 | 0.000153 |
| 1 | 0.75043 | 0.24957 | mths_since_last_major_derog | 581 | 0.001641 |
| 2 | 0.908163 | 0.091837 | mths_since_recent_revol_delinq | 98 | 0.000277 |
| 3 | 0.783784 | 0.216216 | mths_since_last_delinq | 74 | 0.000209 |
| 4 | 0.748167 | 0.251833 | mo_sin_old_il_acct | 2728 | 0.007705 |
| 5 | 0.6 | 0.4 | revol_util | 25 | 0.000071 |
| 6 | 0.905109 | 0.094891 | dti | 137 | 0.000387 |
| 7 | 0.609212 | 0.390788 | int_rate | 1346 | 0.003802 |
| 8 | 0.741897 | 0.258103 | installment | 3394 | 0.009586 |
| 9 | 0.867074 | 0.132926 | mort_acc | 3107 | 0.008776 |
| 10 | 0.746042 | 0.253958 | pub_rec_bankruptcies | 3158 | 0.008920 |
| 11 | 0.861029 | 0.138971 | annual_inc | 3051 | 0.008617 |
| 12 | 0.773154 | 0.226846 | total_acc | 3196 | 0.009027 |
| 13 | 0.935703 | 0.064297 | fico_range_low | 6470 | 0.018274 |
| 14 | 0.935703 | 0.064297 | fico_range_high | 6470 | 0.018274 |
| 15 | 0.777914 | 0.222086 | mo_sin_old_rev_tl_op | 4075 | 0.011510 |
| 16 | 0.732964 | 0.267036 | open_acc | 4153 | 0.011730 |
| 17 | 0.753531 | 0.246469 | pub_rec | 3611 | 0.010199 |
| 18 | 0.778544 | 0.221456 | earliest_cr_line_year | 3915 | 0.011058 |
Note that for int_rate the outliers show a clearly higher proportion of Charged Off: it rises from about 22% in the full sample to about 39% among the outliers.
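If a later iteration replaces the outliers instead of keeping them, here is a minimal capping (winsorizing) sketch along the lines of the mean ± 3·std interval used above; the helper name is mine, not part of the pipeline:
def cap_outliers(df, columns, multiplier=3):
    """Clip each column to [mean - multiplier*std, mean + multiplier*std]."""
    df = df.copy()
    for col in columns:
        mu, sigma = df[col].mean(), df[col].std()
        df[col] = df[col].clip(lower=mu - multiplier * sigma, upper=mu + multiplier * sigma)
    return df
# Example call (not executed in this first iteration):
# pd_loan_train_capped = cap_outliers(pd_loan_train, list_var_continuous, multiplier=3)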
get_corr_matrix(dataset = pd_loan_train[list_var_continuous],
metodo='pearson', size_figure=[10,8])
0
For some algorithms, such as tree ensembles (XGBoost, Random Forest, LightGBM, CatBoost, ...), it is not necessary to remove collinearity between variables. Other algorithms, such as GLMs, do require removing collinearity.
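For GLM-type models, the variance inflation factor (VIF) is a standard collinearity check. A minimal sketch, assuming statsmodels is available (it is not imported elsewhere in this notebook):
# Sketch: VIF per continuous variable (values above ~5-10 usually flag collinearity).
# Caveats: dropna() shrinks the sample a lot here (several columns are >50% null), and a
# perfectly correlated pair such as fico_range_low/fico_range_high yields an infinite VIF.
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

X_vif = sm.add_constant(pd_loan_train[list_var_continuous].dropna())
vif = pd.Series([variance_inflation_factor(X_vif.values, i) for i in range(X_vif.shape[1])],
                index=X_vif.columns)
print(vif.drop('const').sort_values(ascending=False))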
corr = pd_loan_train[list_var_continuous].corr('pearson')
new_corr = corr.abs()
new_corr.loc[:,:] = np.tril(new_corr, k=-1) # keep only the lower triangle, below the main diagonal
new_corr = new_corr.stack().to_frame('correlation').reset_index().sort_values(by='correlation', ascending=False)
new_corr[new_corr['correlation']>0.6]
| | level_0 | level_1 | correlation |
|---|---|---|---|
| 467 | fico_range_high | fico_range_low | 1.000000 |
| 561 | loan_amnt | installment | 0.952669 |
| 619 | earliest_cr_line_year | mo_sin_old_rev_tl_op | 0.916085 |
| 76 | mths_since_recent_revol_delinq | mths_since_recent_bc_dlq | 0.893737 |
| 103 | mths_since_last_delinq | mths_since_recent_revol_delinq | 0.867463 |
| 101 | mths_since_last_delinq | mths_since_recent_bc_dlq | 0.772392 |
| 102 | mths_since_last_delinq | mths_since_last_major_derog | 0.693772 |
| 516 | open_acc | total_acc | 0.690049 |
| 539 | pub_rec | pub_rec_bankruptcies | 0.638374 |
In this first iteration I decide not to remove correlated variables. However, if I end up applying an algorithm that requires it, I will have to remove them; see the sketch below.
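If such an algorithm is used, one variable of each highly correlated pair can be dropped, reusing the new_corr pairs computed above; a sketch with an illustrative 0.9 threshold:
# Sketch: drop the second member (level_1) of each pair above the threshold.
to_drop = sorted(set(new_corr.loc[new_corr['correlation'] > 0.9, 'level_1']))
print(to_drop)
# pd_loan_train_reduced = pd_loan_train.drop(columns=to_drop)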
Do all the nulls belong to a single class of the target variable, or do they show the same proportions of the target?
list_var_continuous
['mths_since_last_record', 'mths_since_recent_bc_dlq', 'mths_since_last_major_derog', 'mths_since_recent_revol_delinq', 'mths_since_last_delinq', 'emp_length', 'mo_sin_old_il_acct', 'revol_util', 'dti', 'term', 'int_rate', 'installment', 'sub_grade', 'mort_acc', 'pub_rec_bankruptcies', 'annual_inc', 'total_acc', 'fico_range_low', 'fico_range_high', 'mo_sin_old_rev_tl_op', 'open_acc', 'pub_rec', 'loan_amnt', 'earliest_cr_line_month', 'earliest_cr_line_year']
get_percent_null_values_target(pd_loan_train, list_var_continuous, target='loan_status')
| | Fully Paid | Charged Off | variable | sum_null_values | porcentaje_sum_null_values |
|---|---|---|---|---|---|
| 0 | 0.784308 | 0.215692 | mths_since_last_record | 287632 | 0.812398 |
| 1 | 0.783524 | 0.216476 | mths_since_recent_bc_dlq | 261211 | 0.737774 |
| 2 | 0.786232 | 0.213768 | mths_since_last_major_derog | 250094 | 0.706374 |
| 3 | 0.782884 | 0.217116 | mths_since_recent_revol_delinq | 226529 | 0.639817 |
| 4 | 0.786621 | 0.213379 | mths_since_last_delinq | 172430 | 0.487017 |
| 5 | 0.710647 | 0.289353 | emp_length | 19198 | 0.054224 |
| 6 | 0.767889 | 0.232111 | mo_sin_old_il_acct | 10146 | 0.028657 |
| 7 | 0.817647 | 0.182353 | revol_util | 170 | 0.000480 |
| 8 | 0.923077 | 0.076923 | dti | 13 | 0.000037 |
Depending on the context the variables come from, they can be imputed one way or another. For example, null values in a variable such as number of bank accounts may mean that the person has no bank accounts, so the appropriate fill value is zero.
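A minimal sketch of that kind of domain-driven fill, on a hypothetical toy column (not one of our variables):
# Hypothetical example: a null number of bank accounts is taken to mean zero accounts.
df_ejemplo = pd.DataFrame({'num_cuentas_bancarias': [2.0, np.nan, 1.0]})
df_ejemplo['num_cuentas_bancarias'] = df_ejemplo['num_cuentas_bancarias'].fillna(0)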
When there is not that much context about the variables, different techniques can be tried and the model results compared:
- Some algorithms accept missing values in their input
- Dropping every row with null values; in our dataset this is not optimal, because quite a few rows are affected
I decide to fill all continuous columns except revol_util with the value -99. This keeps the imputed values clearly separated from the rest of the sample, by placing them far outside each variable's range. The model results can later be explored with different methods.
#pd_loan[list_var_continuous] = pd_loan[list_var_continuous].apply(lambda x: x.fillna(x.mean()))
# pd_loan[list_var_continuous] = pd_loan[list_var_continuous].apply(lambda x: x.fillna(x.median()))
# pd_loan[list_var_continuous] = pd_loan[list_var_continuous].apply(lambda x: x.fillna(x.std()))
# pd_loan[list_var_continuous] = pd_loan[list_var_continuous].apply(lambda x: x.fillna(x.max()))
# pd_loan[list_var_continuous] = pd_loan[list_var_continuous].apply(lambda x: x.fillna(x.min()))
list_vars = list(set(list_var_continuous)-set(['revol_util']))
pd_loan_train[list_vars] = pd_loan_train[list_vars].fillna(-99)
pd_loan_test[list_vars] = pd_loan_test[list_vars].fillna(-99)
list_vars
['total_acc', 'fico_range_high', 'mths_since_last_delinq', 'fico_range_low', 'mths_since_last_major_derog', 'mo_sin_old_rev_tl_op', 'mths_since_recent_bc_dlq', 'pub_rec_bankruptcies', 'sub_grade', 'mths_since_last_record', 'loan_amnt', 'earliest_cr_line_month', 'mo_sin_old_il_acct', 'dti', 'emp_length', 'open_acc', 'installment', 'earliest_cr_line_year', 'mort_acc', 'term', 'pub_rec', 'annual_inc', 'mths_since_recent_revol_delinq', 'int_rate']
pd_loan_test['revol_util'].isnull().sum()
44
get_percent_null_values_target(pd_loan_test, list_var_continuous, target='loan_status')
| | Fully Paid | Charged Off | variable | sum_null_values | porcentaje_sum_null_values |
|---|---|---|---|---|---|
| 0 | 0.681818 | 0.318182 | revol_util | 44 | 0.000497 |
We use KNNImputer to impute the remaining missing values (the revol_util nulls shown above), using all the continuous variables as regressors
X_train = pd_loan_train[list(set(list_var_continuous))]
X_test = pd_loan_test[list(set(list_var_continuous))]
imputer = KNNImputer(n_neighbors=2, weights="uniform")
model = imputer.fit(X_train)
pd_input_train = pd.DataFrame(model.transform(X_train),
columns=[i+'_input' for i in list(set(list_var_continuous))],index=pd_loan_train.index)
pd_input_test = pd.DataFrame(model.transform(X_test),
columns=[i+'_input' for i in list(set(list_var_continuous))],index=pd_loan_test.index)
pd_loan_input_train = pd.concat([pd_loan_train, pd_input_train],axis=1).drop(list(set(list_var_continuous)),axis=1)
pd_loan_input_test = pd.concat([pd_loan_test, pd_input_test],axis=1).drop(list(set(list_var_continuous)),axis=1)
pd_loan_input_train.shape
(354053, 33)
get_percent_null_values_target(pd_loan_input_train, [i+'_input' for i in list_var_continuous], target='loan_status')
No variables with null values found
list_var_continuous = list(pd_loan_input_train.select_dtypes('float').columns)
get_corr_matrix(dataset = pd_loan_input_train[list_var_continuous],
metodo='pearson', size_figure=[10,8])
0
pd_loan_input_train.columns
Index(['grade', 'home_ownership', 'verification_status', 'initial_list_status',
'purpose', 'addr_state', 'application_type', 'loan_status',
'total_acc_input', 'fico_range_high_input',
'mo_sin_old_rev_tl_op_input', 'pub_rec_bankruptcies_input',
'sub_grade_input', 'mths_since_last_record_input', 'loan_amnt_input',
'earliest_cr_line_month_input', 'mo_sin_old_il_acct_input',
'installment_input', 'earliest_cr_line_year_input', 'term_input',
'mths_since_recent_revol_delinq_input', 'mths_since_last_delinq_input',
'fico_range_low_input', 'mths_since_last_major_derog_input',
'revol_util_input', 'mths_since_recent_bc_dlq_input', 'dti_input',
'emp_length_input', 'open_acc_input', 'pub_rec_input',
'annual_inc_input', 'mort_acc_input', 'int_rate_input'],
dtype='object')
For the Spearman correlation, the categorical variables must first be converted to numeric and the correlation computed afterwards; a minimal sketch follows.
There is also the Cramér's V coefficient: https://stackoverflow.com/questions/46498455/categorical-features-correlation
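A sketch of that encoding step, assuming the categorical columns keep their category dtype (integer codes are only a rough device for nominal variables):
# Sketch: encode each categorical column as integer codes, then reuse get_corr_matrix.
pd_cat_codes = pd_loan_input_train[list_var_cat].apply(lambda s: s.cat.codes)
# get_corr_matrix(dataset=pd_cat_codes, metodo='spearman', size_figure=[10, 8])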
list_var_cat
['loan_status', 'grade', 'home_ownership', 'verification_status', 'initial_list_status', 'purpose', 'addr_state', 'application_type']
confusion_matrix = pd.crosstab(pd_loan_input_train["loan_status"], pd_loan_input_train["grade"])
print(confusion_matrix)
cramers_v(confusion_matrix.values)
grade            A      B      C      D      E     F     G
loan_status
Charged Off   3904  13457  23412  18652  12400  4485  1327
Fully Paid   55032  81828  76607  38818  17717  5067  1347
0.2736022015514017
confusion_matrix = pd.crosstab(pd_loan_input_train["loan_status"], pd_loan_input_train["loan_status"])
cramers_v(confusion_matrix.values)
0.9999917508806655
confusion_matrix = pd.crosstab(pd_loan_input_train["loan_status"], pd_loan_input_train["home_ownership"])
cramers_v(confusion_matrix.values)
0.07724234452719861
# after encoding the categorical variables, the Spearman correlation can be obtained
# get_corr_matrix(dataset = pd_loan_input[list_var_cat],
# metodo='spearman', size_figure=[10,8])
For categorical variables, null values are usually replaced either with a new class ("SIN VALOR") or with the mode
pd_loan_input_train[list_var_cat] = pd_loan_input_train[list_var_cat].astype("object").fillna("SIN VALOR").astype("category")
pd_loan_input_test[list_var_cat] = pd_loan_input_test[list_var_cat].astype("object").fillna("SIN VALOR").astype("category")
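An alternative sketch, filling with the mode instead of a new class (left commented out, since the "SIN VALOR" approach above is the one applied):
# for col in list_var_cat:
#     moda = pd_loan_input_train[col].mode()[0]
#     pd_loan_input_train[col] = pd_loan_input_train[col].fillna(moda)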
pd_loan_input_train.to_csv("./data_preprocessing/train_pd_data_preprocessing_missing_outlier.csv")
pd_loan_input_test.to_csv("./data_preprocessing/test_pd_data_preprocessing_missing_outlier.csv")
print(pd_loan_input_train.shape, pd_loan_input_test.shape)
(354053, 33) (88514, 33)